library(tidyverse) # for general analysis
library(fpp)
library(fpp3) # for predictions
library(ggthemes) # for beautiful themes
library(TSA) # for TS analysis
library(kableExtra) # for beautiful tables
library(lubridate) # for time data
library(tsibble) # for tsiblle data format
library(caret) # for modeling
library(stats)
library(ggplot2)
library(MuMIn)
library(Metrics)
library(tstools)
# Set a theme
theme_set(theme_minimal())
Data is imported and transformed into a time series object.
# Import the Zillow monthly data and reduce it to a (Date, Median_price)
# series for Chicago, then build a monthly ts starting February 2008.
# (pivot_longer() replaces the superseded gather().)
sales_month <- read_csv("Zillow monthly raw data (full).csv") #import raw data file
chicago_sales_m <- sales_month %>% filter(RegionName == "Chicago, IL") #keep Chicago rows only
chicago_sales_m <- chicago_sales_m %>% #select median price and date only
pivot_longer(cols = -(RegionID:StateName), names_to = "Date", values_to = "Price") %>%
select(Date, Price) %>%
rename(Median_price = Price)
chicago_sales_m$Date <- mdy(chicago_sales_m$Date) # parse month/day/year date strings
dim(chicago_sales_m)
## [1] 151 2
chi_med <- ts(chicago_sales_m$Median_price, start=c(2008,2), frequency=12)
There are two distinctive trends - a negative trend during and in the wake of the 2008 financial crisis (2008-2013), followed by a positive trend until the present. There is also clear additive seasonality. However, this seasonal trend has been impacted by the current COVID-19 pandemic.
autoplot(chi_med, main="Chicago: Median Home Sale Price")
Omitted COVID-19 from time series due to difficulty predicting values in earlier analysis. It also allows us to work with 12 complete years of data.
# Drop the COVID-19 months: keep Feb 2008 - Jan 2020 (12 complete years).
chi_median <- ts(chi_med, start=c(2008,2), end=c(2020,1), frequency=12)
autoplot(chi_median, main="Chicago: Median Home Sale Price")
Data is complete, and in appropriate form.
# Sanity checks on the trimmed series: completeness, frequency, structure.
sum(is.na(chi_median)) #check for null values
## [1] 0
frequency(chi_median) #check correct frequency of time series
## [1] 12
cycle(chi_median) #check appropriate structure of time series
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2008 2 3 4 5 6 7 8 9 10 11 12
## 2009 1 2 3 4 5 6 7 8 9 10 11 12
## 2010 1 2 3 4 5 6 7 8 9 10 11 12
## 2011 1 2 3 4 5 6 7 8 9 10 11 12
## 2012 1 2 3 4 5 6 7 8 9 10 11 12
## 2013 1 2 3 4 5 6 7 8 9 10 11 12
## 2014 1 2 3 4 5 6 7 8 9 10 11 12
## 2015 1 2 3 4 5 6 7 8 9 10 11 12
## 2016 1 2 3 4 5 6 7 8 9 10 11 12
## 2017 1 2 3 4 5 6 7 8 9 10 11 12
## 2018 1 2 3 4 5 6 7 8 9 10 11 12
## 2019 1 2 3 4 5 6 7 8 9 10 11 12
## 2020 1
min(chi_median) #$149,000, which corresponds to February 2013
## [1] 149000
max(chi_median) #$258,500, which corresponds to July 2008
## [1] 258500
Decomposition of the time series validates the initial impression of the data. However, there appears to be more than one seasonal pattern, evidenced by the “seasonal” and “random” decompositions.
plot(decompose(chi_median))
Median price peaks during the summer months. The range between median prices is also narrower in the spring/summer months than in the rest of the year.
boxplot(chi_median~cycle(chi_median), xlab="Cycle (Month)", ylab="Dollars", main="Chicago: Median Home Sales Price")
Both the ACF and PACF decay slowly and sinusoidally over time, indicating seasonality and a significant linear relationship between the series and its lags.
acf(chi_median, main="ACF Chicago Median Home Sales", lag.max=50)
pacf(chi_median, main="PACF Chicago Median Home Sales", lag.max=50)
Looking at the Raw Periodogram, we can see that there are many spikes and therefore many dominating frequencies. The periodogram has two dominant spikes, which will be most important to the overall signal.
spectrum(chi_median)
periodogram(chi_median)
Create train/test split for modelling.
# Train/test split on exact month boundaries: train runs through Jan 2019,
# test covers Feb 2019 - Jan 2020. c(year, month) replaces the fragile
# decimal time 2019.05, which is not an exact monthly time point and only
# selected the intended months via rounding. Behavior is unchanged.
chi_train <- window(chi_median, end = c(2019, 1))   #end train in Jan 2019
chi_test <- window(chi_median, start = c(2019, 2))  #use last year of dataset as test
#double-check time series structure
frequency(chi_train)
## [1] 12
cycle(chi_train)
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2008 2 3 4 5 6 7 8 9 10 11 12
## 2009 1 2 3 4 5 6 7 8 9 10 11 12
## 2010 1 2 3 4 5 6 7 8 9 10 11 12
## 2011 1 2 3 4 5 6 7 8 9 10 11 12
## 2012 1 2 3 4 5 6 7 8 9 10 11 12
## 2013 1 2 3 4 5 6 7 8 9 10 11 12
## 2014 1 2 3 4 5 6 7 8 9 10 11 12
## 2015 1 2 3 4 5 6 7 8 9 10 11 12
## 2016 1 2 3 4 5 6 7 8 9 10 11 12
## 2017 1 2 3 4 5 6 7 8 9 10 11 12
## 2018 1 2 3 4 5 6 7 8 9 10 11 12
## 2019 1
frequency(chi_test)
## [1] 12
cycle(chi_test)
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2019 2 3 4 5 6 7 8 9 10 11 12
## 2020 1
The Holt-Winters Seasonal Method was selected based on its appropriateness for time series with both a trend and seasonality. Two additive HW models were created - one with and one without damping.
The first model, which does not include damping, does not completely capture the autocorrelation in the time series. There remains significant autocorrelation in the residuals, and the residuals are not normally distributed.
# Additive Holt-Winters without damping; 12-month forecast horizon.
HW1 <- hw(chi_train, seasonal = "additive",h=12, damped = FALSE)
plot(HW1)
checkresiduals(HW1) # Ljung-Box test + residual diagnostics
##
## Ljung-Box test
##
## data: Residuals from Holt-Winters' additive method
## Q* = 61.568, df = 8, p-value = 2.293e-10
##
## Model df: 16. Total lags used: 24
Adding damping does not improve the model. The residuals do not resemble white noise.
# Additive Holt-Winters with a damped trend, same 12-month horizon.
HW2 <- hw(chi_train, seasonal = "additive",h=12, damped = TRUE)
plot(HW2)
checkresiduals(HW2) # Ljung-Box test + residual diagnostics
##
## Ljung-Box test
##
## data: Residuals from Damped Holt-Winters' additive method
## Q* = 60.547, df = 7, p-value = 1.174e-10
##
## Model df: 17. Total lags used: 24
From the EDA, we know that there is significant seasonality in the data, suggesting that a sArima model might be appropriate. We begin exploring ways to make the data stationary.
We first check if a Box-Cox Transformation is necessary, and find that lambda should be set to 1.999 (rounded to 2 for simplicity). Once the Box-Cox Transformation is completed, a KPSS Test for stationarity is completed. The series is not stationary.
# Box-Cox: the estimated lambda is ~2, so we work with a squared-scale series.
BoxCox.lambda(chi_train) # lambda = 1.999924, significantly different from 1
## [1] 1.999924
lambda <- 2 # round lambda value to 2 for simplicity
transformed <- BoxCox(chi_train, lambda=lambda) # BoxCox Transformation
# The KPSS null hypothesis is (level) stationarity, so p = 0.03 < 0.05
# rejects it: the transformed series is NOT stationary yet.
kpss.test(transformed) # p=0.03, data post-Box-Cox transformation is not stationary
##
## KPSS Test for Level Stationarity
##
## data: transformed
## KPSS Level = 0.55011, Truncation lag parameter = 4, p-value = 0.03038
The time series becomes stationary after 1st order differencing.
# First-order differencing of the transformed series makes it stationary.
differenced <- diff(transformed) #difference transformed data
kpss.test(differenced) # KPSS: p=0.1 > 0.05, stationary after Box-Cox transform + 1 difference
## Warning in kpss.test(differenced): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: differenced
## KPSS Level = 0.096927, Truncation lag parameter = 4, p-value = 0.1
# Spell out lag.max: the original `lag=50` worked only via R's partial
# argument matching.
tsdisplay(differenced, lag.max=50) #visualize the differenced series
periodogram(differenced) # NOTE(review): unclear whether the periodogram of differenced data is informative here - confirm
A sArima model is built using auto.arima, with differencing=1 and lambda=2. This results in an ARIMA(1,1,0)(0,1,2)[12] model. The p-value for the Ljung-Box test is 0.05097. This is on the border of being statistically significant, assuming we use a 0.05 benchmark. However, 0.05 is arbitrary and for the purpose of this project, we believe that p=0.051 is an acceptable indicator that the residuals resemble white noise.
# Baseline sARIMA via auto.arima, forcing first differencing and the lambda=2
# Box-Cox transform chosen above; selects ARIMA(1,1,0)(0,1,2)[12] (see below).
AR1 <- auto.arima(chi_train, d=1, lambda=lambda)
summary(AR1)
## Series: chi_train
## ARIMA(1,1,0)(0,1,2)[12]
## Box Cox transformation: lambda= 2
##
## Coefficients:
## ar1 sma1 sma2
## -0.3102 -0.2970 -0.2017
## s.e. 0.0971 0.1126 0.1121
##
## sigma^2 estimated as 1.049e+18: log likelihood=-2637.67
## AIC=5283.34 AICc=5283.69 BIC=5294.46
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 1069.989 4941.647 3764.568 0.5665218 1.917839 0.2884911
## ACF1
## Training set 0.008751159
checkresiduals(AR1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,0)(0,1,2)[12]
## Q* = 32.59, df = 21, p-value = 0.05097
##
## Model df: 3. Total lags used: 24
plot(forecast(AR1))
Further exploration is necessary to see if a superior sArima model can be built.
eacf(chi_train)
## AR/MA
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x x x x x x x x x x x x x x
## 1 x x o o x x x o o x x x x x
## 2 x o x o o o x o o o o x o o
## 3 x o o o o o o o o o o x x o
## 4 o x x o o o o o o o o x o o
## 5 x o o x o o o o o o o x x x
## 6 x x o x x o o o o o o x x x
## 7 x x o x o o o o o o o x x x
We experiment with different combinations of p=1/2, q=1/2, and P=0/1. However, none of the models pass the Ljung-Box test — significant autocorrelation remains in their residuals.
AR2 <- Arima(chi_train, order=c(1,1,1), seasonal=c(0,1,2),lambda=lambda) #p=0.03
summary(AR2)
## Series: chi_train
## ARIMA(1,1,1)(0,1,2)[12]
## Box Cox transformation: lambda= 2
##
## Coefficients:
## ar1 ma1 sma1 sma2
## -0.4387 0.1431 -0.3083 -0.2036
## s.e. 0.1876 0.1990 0.1128 0.1115
##
## sigma^2 estimated as 1.052e+18: log likelihood=-2637.42
## AIC=5284.84 AICc=5285.37 BIC=5298.73
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 1051.221 4921.178 3745.168 0.5573495 1.907905 0.2870044
## ACF1
## Training set -0.005981719
checkresiduals(AR2)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,1)(0,1,2)[12]
## Q* = 33.497, df = 20, p-value = 0.02973
##
## Model df: 4. Total lags used: 24
AR3 <- Arima(chi_train, order=c(1,1,0), seasonal=c(1,1,2),lambda=lambda) #p=0.03
summary(AR3)
## Series: chi_train
## ARIMA(1,1,0)(1,1,2)[12]
## Box Cox transformation: lambda= 2
##
## Coefficients:
## ar1 sar1 sma1 sma2
## -0.3211 -0.7318 0.5002 -0.4931
## s.e. 0.0978 0.1078 0.7083 0.3563
##
## sigma^2 estimated as 9.508e+17: log likelihood=-2636.06
## AIC=5282.12 AICc=5282.65 BIC=5296.01
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 962.8262 4629.704 3584.713 0.5117101 1.824064 0.2747082 0.01163099
checkresiduals(AR3)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,0)(1,1,2)[12]
## Q* = 32.984, df = 20, p-value = 0.03388
##
## Model df: 4. Total lags used: 24
AR4 <- Arima(chi_train, order=c(1,1,1), seasonal=c(1,1,1),lambda=lambda) #p=0.02
summary(AR4)
## Series: chi_train
## ARIMA(1,1,1)(1,1,1)[12]
## Box Cox transformation: lambda= 2
##
## Coefficients:
## ar1 ma1 sar1 sma1
## -0.4390 0.1464 0.2773 -0.6399
## s.e. 0.1912 0.2030 0.2045 0.1746
##
## sigma^2 estimated as 1.068e+18: log likelihood=-2638.14
## AIC=5286.28 AICc=5286.81 BIC=5300.18
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 1032.062 4937.399 3741.426 0.5457746 1.902978 0.2867176
## ACF1
## Training set -0.00711367
checkresiduals(AR4)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,1)(1,1,1)[12]
## Q* = 34.998, df = 20, p-value = 0.02011
##
## Model df: 4. Total lags used: 24
AR5 <- Arima(chi_train, order=c(2,1,0), seasonal=c(0,1,1),lambda=lambda) #p=0.01
summary(AR5)
## Series: chi_train
## ARIMA(2,1,0)(0,1,1)[12]
## Box Cox transformation: lambda= 2
##
## Coefficients:
## ar1 ar2 sma1
## -0.2720 0.0935 -0.4252
## s.e. 0.1059 0.0985 0.1200
##
## sigma^2 estimated as 1.073e+18: log likelihood=-2638.82
## AIC=5285.64 AICc=5285.99 BIC=5296.75
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 901.3052 4925.984 3739.03 0.4798191 1.896996 0.286534 -0.02840107
checkresiduals(AR5)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,0)(0,1,1)[12]
## Q* = 37.589, df = 21, p-value = 0.01439
##
## Model df: 3. Total lags used: 24
Next, we use the ARIMA(1,1,0)(0,1,2)[12] model to forecast results for the next year. The “Actual vs. Predicted” chart shows that the model does a decent job of forecasting values pre-COVID, but is not capable of accurately predicting the trend for the past few months.
#model 1
#model 1
# 12-month forecast from the chosen sARIMA, scored against the hold-out year.
AR1_fcast <- forecast(AR1, h=12)
plot(AR1_fcast, chi_test)
tsplot(cbind(chi_test, AR1_fcast$mean), plot_title="Actual vs. Predicted")
# Metrics::mae/rmse signatures are (actual, predicted); the arguments here are
# (predicted, actual), which is harmless because both metrics are symmetric.
mae(AR1_fcast$mean, chi_test)
## [1] 4181.585
rmse(AR1_fcast$mean, chi_test)
## [1] 4559.497
Next we do cross-validation to understand how the model’s performance evolves over time.
# Rolling-window cross-validation of ARIMA(1,1,0)(0,1,2)[12].
k <- 72 # minimum observations (6 years)
n <- length(chi_median) # Number of data points
p <- 12 # Period
H <- 12 # Forecast horizon
defaultW <- getOption("warn")
options(warn = -1) # silence per-fit warnings during the loop
st <- tsp(chi_median)[1]+(k-2)/p # gives the start time in time units,
error_AR1 <- matrix(NA,n-k,H) # One-year forecast horizon error for each window
AICc_AR1 <- matrix(NA,n-k) # Estimated model AICc value for each window
for(i in seq_len(n-k))
{
# rolling window of length k, followed by an H-step test window
train <- window(chi_median, start=st+(i-k+1)/p, end=st+i/p) ## Window Length: k
test <- window(chi_median, start=st + (i+1)/p, end=st+(i+H)/p) ## Window Length: H
# Refit the chosen sARIMA on this window
AR12 <- Arima(train, order=c(1,1,0), seasonal=list(order=c(0,1,2), period=p), lambda='auto', method='ML') # ARIMA(1,1,0)(0,1,2)[12]
fcast_AR1 <- forecast(AR12, h=H)
# Error & AICc
error_AR1[i,1:length(test)] <- fcast_AR1[['mean']]-test
AICc_AR1[i] <- AR12$aicc
}
options(warn = defaultW) # restore warnings (was captured but never restored)
MAE_AR1 <- colMeans(abs(error_AR1), na.rm=TRUE)
RMSE_AR1 <- sqrt(colMeans(error_AR1^2, na.rm=TRUE))
plot(seq_len(H), MAE_AR1, type="l",col=3,xlab="Horizon", ylab="Error", ylim=c(3600,8000))
lines(seq_len(H), RMSE_AR1, type="l",col=2)
# Legend labels/colors now match the plotted lines (MAE=col 3, RMSE=col 2).
legend("topleft",legend=c("AR1 - MAE", "AR1 - RMSE"),col=c(3,2),lty=1)
# Use n-k instead of the hard-coded 72 so the plot tracks the data length.
plot(seq_len(n-k), AICc_AR1, type="l",col=1,xlab="Iterations", ylab="AICc", ylim=c(750,2650))
legend("bottomright",legend="AR1 - AICc",col=1,lty=1)
As previously mentioned, the sArima model was not able to capture the changed environment due to COVID. Therefore, we will try Regression with Arima errors, in order to include other leading predictors.
We tried Google Trends data for two search terms: “homes for sale” and “realtor”.
First, we import the datasets and double-check dimensions.
homes <- read_csv("home for sale.csv") #Google Trends "home for sale" Data
realtor <- read_csv("realtor.csv") #Google Trends "realtor" Data
homes$Month <- yearmonth(homes$Month)
realtor$Month <- yearmonth(realtor$Month)
dim(homes)
## [1] 151 2
dim(realtor)
## [1] 151 2
sum(is.na(homes))
## [1] 0
sum(is.na(realtor))
## [1] 0
Store as TS object
homes_ts <- ts(homes$`Home for sale`, frequency=12, start=c(2008,2), end=c(2020,1))
realtor_ts <- ts(realtor$Realtor, frequency=12, start=c(2008,2), end=c(2020,1))
Visualize the median sale price (scaled) dataset and the two Google Trends datasets. They show similar seasonality, suggesting that they might be appropriate predictors.
chi_median_scale <- chi_median/1000
tsplot(cbind(chi_median_scale, homes_ts, realtor_ts), plot_title="Trends")
Train/test split
# Train/test split for the Google Trends predictors, matching the target
# series split: train through Jan 2019, test = Feb 2019 - Jan 2020.
# (Original comments said "August 2019", which did not match the code; the
# fragile decimal time 2019.05 is replaced with explicit c(year, month).)
home_train <- window(homes_ts, end = c(2019, 1))      #end train in Jan 2019
home_test <- window(homes_ts, start = c(2019, 2))     #use last year of dataset as test
realtor_train <- window(realtor_ts, end = c(2019, 1)) #end train in Jan 2019
realtor_test <- window(realtor_ts, start = c(2019, 2))#use last year of dataset as test
Transform the realtor data with its estimated Box-Cox lambda prior to fitting the regression model.
# Estimate a Box-Cox lambda for the realtor series and transform it.
# NOTE(review): the original comment reported lambda = 1.87 while the prose
# above says 0.3 - re-run BoxCox.lambda(realtor_train) to confirm which.
lambda_realtor <- BoxCox.lambda(realtor_train) # estimated lambda - TODO confirm reported value
transformed_realtor <- BoxCox(realtor_train, lambda=lambda_realtor) # BoxCox Transformation
The regression model using the transformed_realtor dataset does not pass the Ljung-Box test (p=0.03587) — significant autocorrelation remains in the residuals.
RAR1 <- auto.arima(y=chi_train, lambda=lambda, xreg=transformed_realtor) #auto.arima model
summary(RAR1) # b) summary
## Series: chi_train
## Regression with ARIMA(1,1,0)(0,1,2)[12] errors
## Box Cox transformation: lambda= 2
##
## Coefficients:
## ar1 sma1 sma2 xreg
## -0.3116 -0.3106 -0.2004 425096.3
## s.e. 0.0966 0.1136 0.1126 408400.9
##
## sigma^2 estimated as 1.047e+18: log likelihood=-2637.13
## AIC=5284.25 AICc=5284.78 BIC=5298.15
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 1062.713 4907.453 3735.484 0.5620696 1.901453 0.2862623
## ACF1
## Training set -0.0007944304
checkresiduals(RAR1) # c) check residuals
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(1,1,0)(0,1,2)[12] errors
## Q* = 32.757, df = 20, p-value = 0.03587
##
## Model df: 4. Total lags used: 24
The “home for sale” dataset did not need to be transformed.
# Estimate a Box-Cox lambda for the "home for sale" series.
# NOTE(review): the original comment said "similar to 1 so won't transform",
# but the transformed series IS what gets passed to RAR2 below, and the
# reported estimate (0.43) is not close to 1 - confirm the intent.
lambda_home <- BoxCox.lambda(home_train) # lambda estimate (reported as 0.43)
transformed_home <- BoxCox(home_train, lambda=lambda_home) # BoxCox Transformation
Similar to the other regression model, there remains significant autocorrelation in the residuals for this model as well.
RAR2 <- auto.arima(y=chi_train, lambda=lambda, xreg=transformed_home) #auto.arima model
summary(RAR2) # b) summary
## Series: chi_train
## Regression with ARIMA(0,1,0)(0,1,0)[12] errors
## Box Cox transformation: lambda= 2
##
## Coefficients:
## xreg
## 160586165
## s.e. 93761414
##
## sigma^2 estimated as 1.35e+18: log likelihood=-2652.29
## AIC=5308.58 AICc=5308.68 BIC=5314.14
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 493.8748 5724.653 4172.448 0.2591168 2.136138 0.3197482 -0.3611121
checkresiduals(RAR2) # c) check residuals
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(0,1,0)(0,1,0)[12] errors
## Q* = 67.876, df = 23, p-value = 2.576e-06
##
## Model df: 1. Total lags used: 24
Save as TS object, do train/test split
# Combine both Google Trends series into one multivariate ts for use as xreg.
gtrends <- cbind(homes, realtor$Realtor)
gtrends_ts <- ts(gtrends, frequency=12, start=c(2008,2), end=c(2020,1))
# Keep only the two predictor columns (drops the Month column).
gtrends_ts <- cbind(gtrends_ts[,'Home for sale'], gtrends_ts[,'realtor$Realtor'])
# Split to match the target series: train through Jan 2019, test = last year.
# (Original comments said "August 2019", which did not match the code.)
gtrends_train <- window(gtrends_ts, end = c(2019, 1))   #end train in Jan 2019
gtrends_test <- window(gtrends_ts, start = c(2019, 2))  #use last year of dataset as test
As with the other regression models, there remains significant autocorrelation in the residuals. We will experiment with other predictors later, time permitting.
RAR3 <- auto.arima(y=chi_train, lambda=lambda, xreg=gtrends_train) #auto.arima model
summary(RAR3) # b) summary
## Series: chi_train
## Regression with ARIMA(0,1,0)(0,1,0)[12] errors
## Box Cox transformation: lambda= 2
##
## Coefficients:
## gtrends_ts[, "Home for sale"] gtrends_ts[, "realtor$Realtor"]
## 15819071 10641279
## s.e. 10454873 14885032
##
## sigma^2 estimated as 1.358e+18: log likelihood=-2652.13
## AIC=5310.26 AICc=5310.47 BIC=5318.59
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 486.6505 5714.013 4162.284 0.2552087 2.129803 0.3189694 -0.365891
checkresiduals(RAR3) # c) check residuals
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(0,1,0)(0,1,0)[12] errors
## Q* = 69.758, df = 22, p-value = 7.219e-07
##
## Model df: 2. Total lags used: 24
Recall that the periodogram points to two major spikes.
# Tabulate the periodogram spectrum against frequency for the training series.
temp <- periodogram(chi_train)
spec_p <- matrix(round(temp$spec/1e5, 2))
freq_p <- matrix(temp$freq)
cbind(spec_p, freq_p) # two major spikes at freq 0.0074 (row 1) and 0.0815 (row 11), per the output below
## [,1] [,2]
## [1,] 506115.69 0.007407407
## [2,] 35676.94 0.014814815
## [3,] 1144.62 0.022222222
## [4,] 8029.70 0.029629630
## [5,] 10025.13 0.037037037
## [6,] 7849.82 0.044444444
## [7,] 5659.41 0.051851852
## [8,] 3956.29 0.059259259
## [9,] 12754.36 0.066666667
## [10,] 2951.17 0.074074074
## [11,] 159940.23 0.081481481
## [12,] 49147.41 0.088888889
## [13,] 858.39 0.096296296
## [14,] 148.28 0.103703704
## [15,] 226.85 0.111111111
## [16,] 389.89 0.118518519
## [17,] 489.30 0.125925926
## [18,] 305.07 0.133333333
## [19,] 1024.03 0.140740741
## [20,] 404.58 0.148148148
## [21,] 1523.30 0.155555556
## [22,] 2566.60 0.162962963
## [23,] 8394.61 0.170370370
## [24,] 264.95 0.177777778
## [25,] 616.84 0.185185185
## [26,] 772.83 0.192592593
## [27,] 277.76 0.200000000
## [28,] 376.47 0.207407407
## [29,] 236.36 0.214814815
## [30,] 590.41 0.222222222
## [31,] 526.37 0.229629630
## [32,] 114.09 0.237037037
## [33,] 1229.78 0.244444444
## [34,] 2064.65 0.251851852
## [35,] 106.71 0.259259259
## [36,] 656.63 0.266666667
## [37,] 53.24 0.274074074
## [38,] 205.91 0.281481481
## [39,] 150.02 0.288888889
## [40,] 379.36 0.296296296
## [41,] 196.86 0.303703704
## [42,] 189.96 0.311111111
## [43,] 415.47 0.318518519
## [44,] 216.71 0.325925926
## [45,] 1990.78 0.333333333
## [46,] 72.82 0.340740741
## [47,] 353.82 0.348148148
## [48,] 357.78 0.355555556
## [49,] 798.82 0.362962963
## [50,] 242.78 0.370370370
## [51,] 41.58 0.377777778
## [52,] 450.60 0.385185185
## [53,] 369.11 0.392592593
## [54,] 313.05 0.400000000
## [55,] 140.36 0.407407407
## [56,] 638.75 0.414814815
## [57,] 339.75 0.422222222
## [58,] 64.48 0.429629630
## [59,] 543.94 0.437037037
## [60,] 109.46 0.444444444
## [61,] 198.45 0.451851852
## [62,] 23.81 0.459259259
## [63,] 389.30 0.466666667
## [64,] 827.45 0.474074074
## [65,] 173.57 0.481481481
## [66,] 189.30 0.488888889
## [67,] 674.16 0.496296296
# Convert the two dominant periodogram frequencies into periods in years.
(1/0.00741)/12 # ~135 months = 11.2 years: one low-frequency swing spanning the sample (the trend)
## [1] 11.24606
(1/0.08148)/12 # ~12.3 months = 1 year: annual seasonality
## [1] 1.022746
We create a regression model using a Fourier transformation (K=2) as the predictor, and Arima errors. This results in an ARIMA(0,2,3)(1,0,0)[12] model. The residuals are stationary and relatively normally distributed, signaling that they are white noise (p=0.09553).
DHR1 <- auto.arima(chi_train, xreg=fourier(chi_train, K=2)) # in example seasonal=FALSE, not sure if that should be the setting here?
summary(DHR1)
## Series: chi_train
## Regression with ARIMA(0,2,3)(1,0,0)[12] errors
##
## Coefficients:
## ma1 ma2 ma3 sar1 S1-12 C1-12 S2-12
## -1.2942 0.5532 -0.2389 0.6197 1738.008 -15309.818 -2878.576
## s.e. 0.0843 0.1600 0.1190 0.0724 2275.337 2278.924 1003.122
## C2-12
## 1525.533
## s.e. 997.508
##
## sigma^2 estimated as 25399909: log likelihood=-1292.61
## AIC=2603.21 AICc=2604.71 BIC=2629.02
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 620.9367 4845.17 3818.003 0.293618 1.914033 0.292586 -0.02108458
checkresiduals(DHR1)
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(0,2,3)(1,0,0)[12] errors
## Q* = 23.732, df = 16, p-value = 0.09553
##
## Model df: 8. Total lags used: 24
This model is less accurate than the sArima model used for prediction earlier.
#model 2
DHR1_fcast <- forecast(DHR1, h=12, xreg=fourier(chi_train, K=2, h=12))
plot(DHR1_fcast, chi_test)
tsplot(cbind(chi_test, DHR1_fcast$mean), plot_title="Actual vs. Predicted")
mae(DHR1_fcast$mean, chi_test)
## [1] 5797.028
rmse(DHR1_fcast$mean, chi_test)
## [1] 6226.299
Will come back to this
#min(chi_median) Recall minimum ($149,000) corresponds to February 2013 (61st observation)
# ACF/PACF/EACF on the pre-trough segment (observations 1-61). The original
# index 0:61 relied on R silently dropping index 0; seq_len(61) is explicit.
# Note: numeric subsetting drops the ts attributes, so these see a plain vector.
acf(chi_median[seq_len(61)])
pacf(chi_median[seq_len(61)])
eacf(chi_median[seq_len(61)])
## AR/MA
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x x x x x x x x x x x x x o
## 1 o o o o o o x o o o o x o o
## 2 x o o o o o x o o o o o o o
## 3 o x o o o o x o o o o o o o
## 4 x o o o o o o o o o o o o o
## 5 x o o o o o o o o o o o o o
## 6 x o o o o o o o o o o x o o
## 7 x o o o o o o o o o o o o o
# Same diagnostics for the post-trough, rising segment (observations 62-144).
acf(chi_median[62:144])
pacf(chi_median[62:144])
eacf(chi_median[62:144])
## AR/MA
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x x x x o o o x x x x x x x
## 1 x x x o x x o o o x x x x o
## 2 o o x o o o o o o o o x o o
## 3 x o x o o o o o o o o x x o
## 4 o o x o o o o o o o o x o o
## 5 x x o o o o o o o o o x x o
## 6 x x o o x o o o o o o x x x
## 7 x x o o o o o o o o o x o o
# auto.arima on the declining segment only (observations 1-61). The original
# chi_median[0:61] worked only because R silently drops index 0; seq_len(61)
# makes the intent explicit. The captured output below is from the original run.
auto.arima(chi_median[seq_len(61)], lambda='auto')
## Series: chi_median[0:61]
## ARIMA(0,1,0)
## Box Cox transformation: lambda= 1.999924
##
## sigma^2 estimated as 2.685e+18: log likelihood=-1358.16
## AIC=2718.33 AICc=2718.4 BIC=2720.42
#arimax(y,order=c(1,0,1), xreg=data.frame(x=c(rep(0,200),1:200)))
#xreg=data.frame(x=c(rep(0,200),1:200)))
#?arimax
#xreg = data.frame(x=c(rep(0,61),chi_median[62:144]))
#xreg
#chi_median